import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as st
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# NOTE(review): plot_confusion_matrix and plot_roc_curve were removed in
# scikit-learn 1.2 (use ConfusionMatrixDisplay / RocCurveDisplay there);
# these imports pin this notebook to scikit-learn < 1.2.
from sklearn.metrics import accuracy_score,recall_score,precision_score,roc_auc_score,f1_score,confusion_matrix,classification_report,plot_confusion_matrix,cohen_kappa_score
from sklearn.metrics import plot_roc_curve
import warnings
# Blanket-suppresses ALL warnings, including deprecation notices.
warnings.filterwarnings('ignore')
# Load the loan-level fact table plus the status and geography master tables.
data1 = pd.read_csv('Loan_Details_transactions.csv')   # loan transactions
data2 = pd.read_csv('Loan_status_mapping.csv')         # Loan_Id -> default flag
data3 = pd.read_excel('City_Master.xlsx')
data4 = pd.read_csv('Branch_ID_Master.csv')
data5 = pd.read_excel('State_Master.xlsx')
data6 = pd.read_excel('Region_Master.xlsx')
data7 = pd.read_excel('Postal_Code_Master.xlsx')
data8 = pd.read_excel('State_Region_Mapping.xlsx')

# Attach the default status to every loan record.
output1 = data1.merge(data2, on='Loan_Id', how='inner')
output1.head()
| Loan_Id | Disbursed_Amount | Asset_Cost | ltv | Branch_Id | City_Code | State_Code | Postal_Code | Date_of_Birth | Employment_Type | ... | Aadhar_flag | PAN_flag | VoterID_flag | Driving_flag | Passport_flag | PERFORM_CNS.SCORE | DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS | CREDIT.HISTORY.LENGTH | NO.OF_INQUIRIES | Loan_Default | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | ST15 | 400001 | 14-06-1990 | Self employed | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0yrs 0mon | 0 | 0 |
| 1 | 2 | 48749 | 69303 | 72.15 | 67 | CT03 | ST24 | 600020 | 01-01-1991 | Salaried | ... | 1 | 0 | 0 | 0 | 0 | 783 | 0 | 2yrs 6mon | 0 | 0 |
| 2 | 3 | 55348 | 66340 | 85.00 | 2 | CT13 | ST04 | 801110 | 16-08-1993 | Self employed | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0yrs 0mon | 0 | 1 |
| 3 | 4 | 48849 | 64133 | 77.96 | 217 | CT06 | ST22 | 302029 | 01-01-1989 | Self employed | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0yrs 0mon | 0 | 0 |
| 4 | 5 | 40394 | 59386 | 70.72 | 74 | CT03 | ST24 | 600020 | 31-12-1974 | Self employed | ... | 1 | 0 | 0 | 0 | 0 | 14 | 1 | 3yrs 1mon | 1 | 0 |
5 rows × 22 columns
# Map each city to its postal codes.
output2 = data3.merge(data7, on='City_Code', how='inner')
output2.head()
| City_Code | State_Code | City_Name_x | City_Name_y | Postal_Code | |
|---|---|---|---|---|---|
| 0 | CT01 | ST15 | Mumbai | Mumbai | 400001 |
| 1 | CT01 | ST15 | Mumbai | Mumbai | 400053 |
| 2 | CT01 | ST15 | Mumbai | Mumbai | 422001 |
| 3 | CT02 | ST15 | Pune | Pune | 411002 |
| 4 | CT02 | ST15 | Pune | Pune | 411006 |
# Add the state name for each city/postal-code row.
output3 = output2.merge(data5, on='State_Code', how='inner')
output3.head()
| City_Code | State_Code | City_Name_x | City_Name_y | Postal_Code | State_Name | |
|---|---|---|---|---|---|---|
| 0 | CT01 | ST15 | Mumbai | Mumbai | 400001 | Maharashtra |
| 1 | CT01 | ST15 | Mumbai | Mumbai | 400053 | Maharashtra |
| 2 | CT01 | ST15 | Mumbai | Mumbai | 422001 | Maharashtra |
| 3 | CT02 | ST15 | Pune | Pune | 411002 | Maharashtra |
| 4 | CT02 | ST15 | Pune | Pune | 411006 | Maharashtra |
# Attach the state-to-region mapping.
output4 = output3.merge(data8, on='State_Code', how='inner')
output4.head()
| City_Code | State_Code | City_Name_x | City_Name_y | Postal_Code | State_Name | Region | State Name | Region_ID | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | CT01 | ST15 | Mumbai | Mumbai | 400001 | Maharashtra | West | Maharashtra | R4 |
| 1 | CT01 | ST15 | Mumbai | Mumbai | 400053 | Maharashtra | West | Maharashtra | R4 |
| 2 | CT01 | ST15 | Mumbai | Mumbai | 422001 | Maharashtra | West | Maharashtra | R4 |
| 3 | CT02 | ST15 | Pune | Pune | 411002 | Maharashtra | West | Maharashtra | R4 |
| 4 | CT02 | ST15 | Pune | Pune | 411006 | Maharashtra | West | Maharashtra | R4 |
# Prefix the region master columns onto the geography table (left = data6
# keeps Region_ID / Region_Name as the leading columns, as before).
output5 = data6.merge(output4, on='Region_ID', how='inner')
output5.head()
| Region_ID | Region_Name | City_Code | State_Code | City_Name_x | City_Name_y | Postal_Code | State_Name | Region | State Name | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | R1 | North | CT06 | ST22 | Jaipur | Jaipur | 302029 | Rajasthan | North | Rajasthan |
| 1 | R1 | North | CT06 | ST22 | Jaipur | Jaipur | 302013 | Rajasthan | North | Rajasthan |
| 2 | R1 | North | CT09 | ST34 | Delhi | Delhi | 110039 | Delhi | North | Delhi |
| 3 | R1 | North | CT18 | ST21 | Ludhiana | Ludhiana | 141003 | Punjab | North | Punjab |
| 4 | R2 | South | CT03 | ST24 | Chennai | Chennai | 600020 | Tamil Nadu | South | Tamil Nadu |
# Join loans to the geography table.
# NOTE(review): joining on City_Code alone fans each loan out once per postal
# code of its city (Loan_Id 1 appears 3x in the head() below) — consider
# including the loan's Postal_Code in the join keys. Kept as-is.
df = output1.merge(output5, on='City_Code', how='inner')
df.head()
| Loan_Id | Disbursed_Amount | Asset_Cost | ltv | Branch_Id | City_Code | State_Code_x | Postal_Code_x | Date_of_Birth | Employment_Type | ... | Loan_Default | Region_ID | Region_Name | State_Code_y | City_Name_x | City_Name_y | Postal_Code_y | State_Name | Region | State Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | ST15 | 400001 | 14-06-1990 | Self employed | ... | 0 | R4 | West | ST15 | Mumbai | Mumbai | 400001 | Maharashtra | West | Maharashtra |
| 1 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | ST15 | 400001 | 14-06-1990 | Self employed | ... | 0 | R4 | West | ST15 | Mumbai | Mumbai | 400053 | Maharashtra | West | Maharashtra |
| 2 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | ST15 | 400001 | 14-06-1990 | Self employed | ... | 0 | R4 | West | ST15 | Mumbai | Mumbai | 422001 | Maharashtra | West | Maharashtra |
| 3 | 10 | 34639 | 69717 | 50.49 | 34 | CT01 | ST15 | 400001 | 23-11-1982 | Self employed | ... | 1 | R4 | West | ST15 | Mumbai | Mumbai | 400001 | Maharashtra | West | Maharashtra |
| 4 | 10 | 34639 | 69717 | 50.49 | 34 | CT01 | ST15 | 400001 | 23-11-1982 | Self employed | ... | 1 | R4 | West | ST15 | Mumbai | Mumbai | 400053 | Maharashtra | West | Maharashtra |
5 rows × 31 columns
# Column inventory after the merges (note the _x/_y suffixed duplicates).
df.columns
Index(['Loan_Id', 'Disbursed_Amount', 'Asset_Cost', 'ltv', 'Branch_Id',
'City_Code', 'State_Code_x', 'Postal_Code_x', 'Date_of_Birth',
'Employment_Type', 'DisbursalDate', 'MobileNo_Avl_Flag', 'Aadhar_flag',
'PAN_flag', 'VoterID_flag', 'Driving_flag', 'Passport_flag',
'PERFORM_CNS.SCORE', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
'CREDIT.HISTORY.LENGTH', 'NO.OF_INQUIRIES', 'Loan_Default', 'Region_ID',
'Region_Name', 'State_Code_y', 'City_Name_x', 'City_Name_y',
'Postal_Code_y', 'State_Name', 'Region', 'State Name'],
dtype='object')
# Drop redundant duplicate geography columns introduced by the merges.
df.drop(['State_Name','Postal_Code_x','State_Code_x','City_Name_y','Region_Name'],inplace=True,axis=1)
# Binary employment indicator: 1 for 'Self employed', 0 otherwise (incl. NaN).
# BUG FIX: the original assigned the DataFrame returned by pd.get_dummies(...)
# to a single column — that relies on fragile alignment and breaks as soon as
# Employment_Type has more than two categories (it is later fillna'd to a
# third category, 'Unemployed'). An explicit comparison is unambiguous and
# produces the same 0/1 uint8 column.
df['Self employed'] = (df['Employment_Type'] == 'Self employed').astype('uint8')
df.head()
| Loan_Id | Disbursed_Amount | Asset_Cost | ltv | Branch_Id | City_Code | Date_of_Birth | Employment_Type | DisbursalDate | MobileNo_Avl_Flag | ... | CREDIT.HISTORY.LENGTH | NO.OF_INQUIRIES | Loan_Default | Region_ID | State_Code_y | City_Name_x | Postal_Code_y | Region | State Name | Self employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | 14-06-1990 | Self employed | 28-09-2018 | 1 | ... | 0yrs 0mon | 0 | 0 | R4 | ST15 | Mumbai | 400001 | West | Maharashtra | 1 |
| 1 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | 14-06-1990 | Self employed | 28-09-2018 | 1 | ... | 0yrs 0mon | 0 | 0 | R4 | ST15 | Mumbai | 400053 | West | Maharashtra | 1 |
| 2 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | 14-06-1990 | Self employed | 28-09-2018 | 1 | ... | 0yrs 0mon | 0 | 0 | R4 | ST15 | Mumbai | 422001 | West | Maharashtra | 1 |
| 3 | 10 | 34639 | 69717 | 50.49 | 34 | CT01 | 23-11-1982 | Self employed | 26-10-2018 | 1 | ... | 3yrs 10mon | 0 | 1 | R4 | ST15 | Mumbai | 400001 | West | Maharashtra | 1 |
| 4 | 10 | 34639 | 69717 | 50.49 | 34 | CT01 | 23-11-1982 | Self employed | 26-10-2018 | 1 | ... | 3yrs 10mon | 0 | 1 | R4 | ST15 | Mumbai | 400053 | West | Maharashtra | 1 |
5 rows × 27 columns
# Visual missing-data diagnostics with missingno.
import missingno as ms
ms.bar(df)
<AxesSubplot:>
ms.matrix(df)
<AxesSubplot:>
# Exact per-column null counts (only Employment_Type has missing values).
df.isnull().sum()
Loan_Id 0 Disbursed_Amount 0 Asset_Cost 0 ltv 0 Branch_Id 0 City_Code 0 Date_of_Birth 0 Employment_Type 1141 DisbursalDate 0 MobileNo_Avl_Flag 0 Aadhar_flag 0 PAN_flag 0 VoterID_flag 0 Driving_flag 0 Passport_flag 0 PERFORM_CNS.SCORE 0 DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS 0 CREDIT.HISTORY.LENGTH 0 NO.OF_INQUIRIES 0 Loan_Default 0 Region_ID 0 State_Code_y 0 City_Name_x 0 Postal_Code_y 0 Region 0 State Name 0 Self employed 0 dtype: int64
# No fully duplicated rows.
df.duplicated().sum()
0
# NOTE(review): drop_duplicates() returns a new frame and the result is not
# assigned, so this line is a no-op (harmless — there are no duplicates).
df.drop_duplicates()
| Loan_Id | Disbursed_Amount | Asset_Cost | ltv | Branch_Id | City_Code | Date_of_Birth | Employment_Type | DisbursalDate | MobileNo_Avl_Flag | ... | CREDIT.HISTORY.LENGTH | NO.OF_INQUIRIES | Loan_Default | Region_ID | State_Code_y | City_Name_x | Postal_Code_y | Region | State Name | Self employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | 14-06-1990 | Self employed | 28-09-2018 | 1 | ... | 0yrs 0mon | 0 | 0 | R4 | ST15 | Mumbai | 400001 | West | Maharashtra | 1 |
| 1 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | 14-06-1990 | Self employed | 28-09-2018 | 1 | ... | 0yrs 0mon | 0 | 0 | R4 | ST15 | Mumbai | 400053 | West | Maharashtra | 1 |
| 2 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | 14-06-1990 | Self employed | 28-09-2018 | 1 | ... | 0yrs 0mon | 0 | 0 | R4 | ST15 | Mumbai | 422001 | West | Maharashtra | 1 |
| 3 | 10 | 34639 | 69717 | 50.49 | 34 | CT01 | 23-11-1982 | Self employed | 26-10-2018 | 1 | ... | 3yrs 10mon | 0 | 1 | R4 | ST15 | Mumbai | 400001 | West | Maharashtra | 1 |
| 4 | 10 | 34639 | 69717 | 50.49 | 34 | CT01 | 23-11-1982 | Self employed | 26-10-2018 | 1 | ... | 3yrs 10mon | 0 | 1 | R4 | ST15 | Mumbai | 400053 | West | Maharashtra | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 33897 | 23130 | 59913 | 69678 | 87.55 | 5 | CT05 | 16-04-1994 | Salaried | 20-09-2018 | 1 | ... | 0yrs 0mon | 0 | 0 | R2 | ST25 | Hyderabad | 500045 | South | Telangana | 0 |
| 33898 | 23250 | 49078 | 68780 | 74.15 | 251 | CT05 | 11-01-1970 | Self employed | 09-08-2018 | 1 | ... | 0yrs 0mon | 0 | 1 | R2 | ST25 | Hyderabad | 500004 | South | Telangana | 1 |
| 33899 | 23250 | 49078 | 68780 | 74.15 | 251 | CT05 | 11-01-1970 | Self employed | 09-08-2018 | 1 | ... | 0yrs 0mon | 0 | 1 | R2 | ST25 | Hyderabad | 500045 | South | Telangana | 1 |
| 33900 | 23254 | 56174 | 76264 | 76.71 | 3 | CT05 | 01-12-1969 | Salaried | 13-10-2018 | 1 | ... | 4yrs 10mon | 0 | 0 | R2 | ST25 | Hyderabad | 500004 | South | Telangana | 0 |
| 33901 | 23254 | 56174 | 76264 | 76.71 | 3 | CT05 | 01-12-1969 | Salaried | 13-10-2018 | 1 | ... | 4yrs 10mon | 0 | 0 | R2 | ST25 | Hyderabad | 500045 | South | Telangana | 0 |
33902 rows × 27 columns
# Category counts before imputing the 1,141 missing employment types.
df['Employment_Type'].value_counts()
Self employed 18376 Salaried 14385 Name: Employment_Type, dtype: int64
# Treat missing employment type as its own 'Unemployed' category.
df['Employment_Type'].fillna('Unemployed', inplace = True)
df.isnull().sum()
Loan_Id 0 Disbursed_Amount 0 Asset_Cost 0 ltv 0 Branch_Id 0 City_Code 0 Date_of_Birth 0 Employment_Type 0 DisbursalDate 0 MobileNo_Avl_Flag 0 Aadhar_flag 0 PAN_flag 0 VoterID_flag 0 Driving_flag 0 Passport_flag 0 PERFORM_CNS.SCORE 0 DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS 0 CREDIT.HISTORY.LENGTH 0 NO.OF_INQUIRIES 0 Loan_Default 0 Region_ID 0 State_Code_y 0 City_Name_x 0 Postal_Code_y 0 Region 0 State Name 0 Self employed 0 dtype: int64
# Re-check duplicates and the final shape after imputation.
df.duplicated().sum()
0
df.shape
(33902, 27)
df.head()
| Loan_Id | Disbursed_Amount | Asset_Cost | ltv | Branch_Id | City_Code | Date_of_Birth | Employment_Type | DisbursalDate | MobileNo_Avl_Flag | ... | CREDIT.HISTORY.LENGTH | NO.OF_INQUIRIES | Loan_Default | Region_ID | State_Code_y | City_Name_x | Postal_Code_y | Region | State Name | Self employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | 14-06-1990 | Self employed | 28-09-2018 | 1 | ... | 0yrs 0mon | 0 | 0 | R4 | ST15 | Mumbai | 400001 | West | Maharashtra | 1 |
| 1 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | 14-06-1990 | Self employed | 28-09-2018 | 1 | ... | 0yrs 0mon | 0 | 0 | R4 | ST15 | Mumbai | 400053 | West | Maharashtra | 1 |
| 2 | 1 | 36439 | 65850 | 56.19 | 64 | CT01 | 14-06-1990 | Self employed | 28-09-2018 | 1 | ... | 0yrs 0mon | 0 | 0 | R4 | ST15 | Mumbai | 422001 | West | Maharashtra | 1 |
| 3 | 10 | 34639 | 69717 | 50.49 | 34 | CT01 | 23-11-1982 | Self employed | 26-10-2018 | 1 | ... | 3yrs 10mon | 0 | 1 | R4 | ST15 | Mumbai | 400001 | West | Maharashtra | 1 |
| 4 | 10 | 34639 | 69717 | 50.49 | 34 | CT01 | 23-11-1982 | Self employed | 26-10-2018 | 1 | ... | 3yrs 10mon | 0 | 1 | R4 | ST15 | Mumbai | 400053 | West | Maharashtra | 1 |
5 rows × 27 columns
# Persist the cleaned, merged dataset and inspect dtypes.
df.to_csv('l&t_loan_default.csv')
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 33902 entries, 0 to 33901 Data columns (total 27 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Loan_Id 33902 non-null int64 1 Disbursed_Amount 33902 non-null int64 2 Asset_Cost 33902 non-null int64 3 ltv 33902 non-null float64 4 Branch_Id 33902 non-null int64 5 City_Code 33902 non-null object 6 Date_of_Birth 33902 non-null object 7 Employment_Type 33902 non-null object 8 DisbursalDate 33902 non-null object 9 MobileNo_Avl_Flag 33902 non-null int64 10 Aadhar_flag 33902 non-null int64 11 PAN_flag 33902 non-null int64 12 VoterID_flag 33902 non-null int64 13 Driving_flag 33902 non-null int64 14 Passport_flag 33902 non-null int64 15 PERFORM_CNS.SCORE 33902 non-null int64 16 DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS 33902 non-null int64 17 CREDIT.HISTORY.LENGTH 33902 non-null object 18 NO.OF_INQUIRIES 33902 non-null int64 19 Loan_Default 33902 non-null int64 20 Region_ID 33902 non-null object 21 State_Code_y 33902 non-null object 22 City_Name_x 33902 non-null object 23 Postal_Code_y 33902 non-null int64 24 Region 33902 non-null object 25 State Name 33902 non-null object 26 Self employed 33902 non-null uint8 dtypes: float64(1), int64(15), object(10), uint8(1) memory usage: 7.0+ MB
# Summary statistics for the numeric columns.
df.describe()
| Loan_Id | Disbursed_Amount | Asset_Cost | ltv | Branch_Id | MobileNo_Avl_Flag | Aadhar_flag | PAN_flag | VoterID_flag | Driving_flag | Passport_flag | PERFORM_CNS.SCORE | DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS | NO.OF_INQUIRIES | Loan_Default | Postal_Code_y | Self employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 33902.000000 | 33902.000000 | 33902.000000 | 33902.000000 | 33902.000000 | 33902.0 | 33902.000000 | 33902.000000 | 33902.000000 | 33902.000000 | 33902.000000 | 33902.000000 | 33902.000000 | 33902.000000 | 33902.000000 | 33902.000000 | 33902.000000 |
| mean | 11640.221786 | 54207.540971 | 75740.440416 | 74.678320 | 72.115922 | 1.0 | 0.848151 | 0.072503 | 0.138399 | 0.023420 | 0.002035 | 294.760516 | 0.099050 | 0.210814 | 0.218748 | 544488.096720 | 0.542033 |
| std | 6754.761740 | 12912.628712 | 18783.082337 | 11.537117 | 69.093252 | 0.0 | 0.358880 | 0.259323 | 0.345323 | 0.151237 | 0.045069 | 339.628968 | 0.387384 | 0.712340 | 0.413403 | 175517.046957 | 0.498237 |
| min | 1.000000 | 13600.000000 | 37230.000000 | 17.130000 | 1.000000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 110039.000000 | 0.000000 |
| 25% | 5792.000000 | 46949.000000 | 65599.250000 | 68.810000 | 13.000000 | 1.0 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 400053.000000 | 0.000000 |
| 50% | 11594.500000 | 53678.000000 | 70854.000000 | 76.750000 | 61.000000 | 1.0 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 14.000000 | 0.000000 | 0.000000 | 0.000000 | 560034.000000 | 1.000000 |
| 75% | 17559.000000 | 60245.500000 | 79192.000000 | 83.670000 | 130.000000 | 1.0 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 680.000000 | 0.000000 | 0.000000 | 0.000000 | 680503.000000 | 1.000000 |
| max | 23315.000000 | 592460.000000 | 715186.000000 | 94.980000 | 261.000000 | 1.0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 879.000000 | 7.000000 | 23.000000 | 1.000000 | 835204.000000 | 1.000000 |
# Histograms for every numeric column.
df.hist(figsize=(15,12),bins = 15)
plt.show()
# Target class balance (~78% non-default vs ~22% default).
sns.countplot(y="Loan_Default" ,data=df)
plt.xlabel("Count of each Target class")
plt.ylabel("Target classes")
plt.show()
# Percentage share of each target class.
pct_loan_default = df['Loan_Default'].value_counts(normalize=True)*100
pct_loan_default
0 78.125184 1 21.874816 Name: Loan_Default, dtype: float64
import plotly.express as px
# Pie chart of the default vs non-default share.
fig = px.pie(values=pct_loan_default.values, names=['Not defaulted','Defaulted'])
fig.show()
About 78.1% of the observations in the dataset have no loan default while about 21.9% have defaulted. It is a slightly imbalanced dataset.
def _plot_top_counts(frame, column, xlabel, title, color, top_n=15):
    """Bar-plot the `top_n` most frequent values of `column` in `frame`.

    Replaces three copy-pasted plotting cells (state / city / region) with
    one helper: 15x5 figure, y-label 'Loan', x tick labels auto-rotated.
    """
    counts = frame[column].value_counts().head(top_n)
    plt.gcf().set_size_inches(15, 5)
    plt.xlabel(xlabel)
    plt.ylabel('Loan')
    plt.title(title)
    plt.bar(counts.keys().tolist(), counts.tolist(), color=color)
    plt.gcf().autofmt_xdate()
    plt.show()

# Loan volume by state, city and region (top 15 of each).
_plot_top_counts(df, 'State Name', 'States', 'State vs No. Loan Default', 'black')
_plot_top_counts(df, 'City_Name_x', 'City', 'City vs No. Loan Default', 'skyblue')
_plot_top_counts(df, 'Region', 'Region', 'Region vs No. Loan Default', 'violet')
# Delinquent-account counts per state (scatter on a categorical axis is an
# odd choice but kept as-is).
df.plot.scatter(x="DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS",y="State Name",color='b')
plt.show()
# Inquiry counts split by default status.
sns.histplot(x='NO.OF_INQUIRIES',hue='Loan_Default',data=df,multiple='dodge')
<AxesSubplot:xlabel='NO.OF_INQUIRIES', ylabel='Count'>
sns.countplot(x="DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS", hue="Loan_Default", data=df)
<AxesSubplot:xlabel='DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', ylabel='count'>
sns.countplot(x="Region", hue="Loan_Default", data=df)
<AxesSubplot:xlabel='Region', ylabel='count'>
# Distribution of delinquent accounts in the last six months.
df['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS'].value_counts()
0 31254 1 2148 2 360 3 100 4 19 5 14 6 5 7 2 Name: DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS, dtype: int64
# plotting countplot for credit history of users
plt.rcParams['figure.figsize'] = (18, 5)
# NOTE(review): .head(100) plots only the first 100 rows, not the full
# column's distribution — confirm the sampling is intentional.
sns.countplot(df['CREDIT.HISTORY.LENGTH'].head(100))
plt.title('Credit History')
plt.xticks(rotation = 45)
# BUG FIX: was `plt.show` (attribute access with no call), which returned the
# function object instead of rendering the figure.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Finding the relations between the variables.
plt.figure(figsize=(20,10))
# Pairwise Pearson correlations. MobileNo_Avl_Flag is constant (all 1s), so
# its row/column is NaN. NOTE(review): on pandas >= 2.0 df.corr() raises on
# the remaining object columns — numeric_only=True would be needed there.
c= df.corr()
sns.heatmap(c,cmap="BrBG",annot=True)
c
| Loan_Id | Disbursed_Amount | Asset_Cost | ltv | Branch_Id | MobileNo_Avl_Flag | Aadhar_flag | PAN_flag | VoterID_flag | Driving_flag | Passport_flag | PERFORM_CNS.SCORE | DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS | NO.OF_INQUIRIES | Loan_Default | Postal_Code_y | Self employed | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Loan_Id | 1.000000 | -0.010443 | -0.006060 | -0.006236 | 0.000451 | NaN | 0.004081 | -0.002740 | -0.002796 | 0.002814 | -0.006450 | 0.010577 | 0.001240 | -0.005989 | -0.007867 | -0.003031 | 0.007667 |
| Disbursed_Amount | -0.010443 | 1.000000 | 0.743790 | 0.382782 | 0.026510 | NaN | -0.020633 | 0.010374 | 0.019912 | -0.002494 | -0.001403 | 0.008429 | 0.016384 | 0.029753 | 0.063384 | 0.006900 | -0.038269 |
| Asset_Cost | -0.006060 | 0.743790 | 1.000000 | -0.306615 | 0.022288 | NaN | -0.095394 | 0.039575 | 0.090969 | 0.015646 | -0.007224 | -0.053998 | -0.003329 | -0.023332 | 0.003411 | 0.001536 | 0.037164 |
| ltv | -0.006236 | 0.382782 | -0.306615 | 1.000000 | 0.015304 | NaN | 0.098749 | -0.028779 | -0.092375 | -0.023206 | 0.008698 | 0.088182 | 0.034832 | 0.085053 | 0.093080 | 0.006074 | -0.100826 |
| Branch_Id | 0.000451 | 0.026510 | 0.022288 | 0.015304 | 1.000000 | NaN | -0.044395 | 0.020123 | 0.029793 | -0.016579 | -0.010003 | -0.022520 | -0.000832 | -0.001061 | 0.036352 | 0.000695 | -0.012531 |
| MobileNo_Avl_Flag | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Aadhar_flag | 0.004081 | -0.020633 | -0.095394 | 0.098749 | -0.044395 | NaN | 1.000000 | -0.174564 | -0.864610 | -0.285559 | -0.079373 | 0.067861 | 0.023320 | 0.026223 | -0.047694 | -0.005057 | -0.090999 |
| PAN_flag | -0.002740 | 0.010374 | 0.039575 | -0.028779 | 0.020123 | NaN | -0.174564 | 1.000000 | 0.155086 | 0.010103 | 0.002517 | 0.025517 | 0.002212 | 0.023764 | 0.008892 | 0.005900 | 0.013397 |
| VoterID_flag | -0.002796 | 0.019912 | 0.090969 | -0.092375 | 0.029793 | NaN | -0.864610 | 0.155086 | 1.000000 | -0.055289 | -0.018100 | -0.073270 | -0.023317 | -0.015486 | 0.039183 | -0.001752 | 0.090657 |
| Driving_flag | 0.002814 | -0.002494 | 0.015646 | -0.023206 | -0.016579 | NaN | -0.285559 | 0.010103 | -0.055289 | 1.000000 | -0.006994 | 0.006913 | 0.000682 | -0.028308 | 0.004394 | -0.005859 | 0.007683 |
| Passport_flag | -0.006450 | -0.001403 | -0.007224 | 0.008698 | -0.010003 | NaN | -0.079373 | 0.002517 | -0.018100 | -0.006994 | 1.000000 | 0.012627 | -0.006479 | -0.012446 | -0.004898 | 0.000543 | -0.013662 |
| PERFORM_CNS.SCORE | 0.010577 | 0.008429 | -0.053998 | 0.088182 | -0.022520 | NaN | 0.067861 | 0.025517 | -0.073270 | 0.006913 | 0.012627 | 1.000000 | 0.153743 | 0.138564 | -0.064395 | -0.015129 | -0.037151 |
| DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS | 0.001240 | 0.016384 | -0.003329 | 0.034832 | -0.000832 | NaN | 0.023320 | 0.002212 | -0.023317 | 0.000682 | -0.006479 | 0.153743 | 1.000000 | 0.069491 | 0.030842 | -0.001252 | 0.013427 |
| NO.OF_INQUIRIES | -0.005989 | 0.029753 | -0.023332 | 0.085053 | -0.001061 | NaN | 0.026223 | 0.023764 | -0.015486 | -0.028308 | -0.012446 | 0.138564 | 0.069491 | 1.000000 | 0.044435 | -0.007495 | 0.004412 |
| Loan_Default | -0.007867 | 0.063384 | 0.003411 | 0.093080 | 0.036352 | NaN | -0.047694 | 0.008892 | 0.039183 | 0.004394 | -0.004898 | -0.064395 | 0.030842 | 0.044435 | 1.000000 | 0.012550 | 0.027108 |
| Postal_Code_y | -0.003031 | 0.006900 | 0.001536 | 0.006074 | 0.000695 | NaN | -0.005057 | 0.005900 | -0.001752 | -0.005859 | 0.000543 | -0.015129 | -0.001252 | -0.007495 | 0.012550 | 1.000000 | 0.004455 |
| Self employed | 0.007667 | -0.038269 | 0.037164 | -0.100826 | -0.012531 | NaN | -0.090999 | 0.013397 | 0.090657 | 0.007683 | -0.013662 | -0.037151 | 0.013427 | 0.004412 | 0.027108 | 0.004455 | 1.000000 |
#Employment_Type
# Default counts by employment type.
pd.crosstab(df['Employment_Type'],df['Loan_Default'])
| Loan_Default | 0 | 1 |
|---|---|---|
| Employment_Type | ||
| Salaried | 11449 | 2936 |
| Self employed | 14167 | 4209 |
| Unemployed | 870 | 271 |
df1 = pd.crosstab(df['Employment_Type'],df['Loan_Default'])
fig = px.bar(df1, barmode = 'group',width=600,height=400)
fig.show()
# Default rate within each employment group.
print('Percentage of salaried people who have defaulted:',
np.round(df[(df['Employment_Type']=='Salaried') & (df['Loan_Default']==1)].shape[0]/(df[df['Employment_Type']=='Salaried'].shape[0])*100,3))
print('Percentage of self employed people who have defaulted:',
np.round(df[(df['Employment_Type']=='Self employed') & (df['Loan_Default']==1)].shape[0]/(df[df['Employment_Type']=='Self employed'].shape[0])*100,3))
Percentage of salaried people who have defaulted: 20.41 Percentage of self employed people who have defaulted: 22.905
#MobileNo Avl Flag
# MobileNo_Avl_Flag is 1 for every row, so this crosstab has a single row.
pd.crosstab(df['MobileNo_Avl_Flag'],df['Loan_Default'])
| Loan_Default | 0 | 1 |
|---|---|---|
| MobileNo_Avl_Flag | ||
| 1 | 26486 | 7416 |
# Grouped bar chart of the (degenerate, single-row) mobile-flag crosstab.
df1 = pd.crosstab(df['MobileNo_Avl_Flag'],df['Loan_Default'])
fig = px.bar(df1, barmode = 'group',width=600,height=400)
fig.show()
# Default rate split by mobile-number availability.
# BUG FIX: the original print labels said "salaried/self employed" (copy-paste
# from the Employment_Type cell), and the second rate divided by the flag==1
# count instead of the flag==0 count. Since every row has the flag set, the
# flag==0 group is empty — guard against dividing by zero.
n_mobile = df[df['MobileNo_Avl_Flag']==1].shape[0]
n_no_mobile = df[df['MobileNo_Avl_Flag']==0].shape[0]
print('Percentage of people with a mobile number who have defaulted:',
np.round(df[(df['MobileNo_Avl_Flag']==1) & (df['Loan_Default']==1)].shape[0]/n_mobile*100,3) if n_mobile else 'n/a (empty group)')
print('Percentage of people without a mobile number who have defaulted:',
np.round(df[(df['MobileNo_Avl_Flag']==0) & (df['Loan_Default']==1)].shape[0]/n_no_mobile*100,3) if n_no_mobile else 'n/a (empty group)')
Percentage of salaried people who have defaulted: 21.875 Percentage of self employed people who have defaulted: 0.0
#Aadhar
# Default counts by Aadhar-document availability.
pd.crosstab(df['Aadhar_flag'],df['Loan_Default'])
| Loan_Default | 0 | 1 |
|---|---|---|
| Aadhar_flag | ||
| 0 | 3782 | 1366 |
| 1 | 22704 | 6050 |
df2= pd.crosstab(df['Aadhar_flag'],df['Loan_Default'])
fig = px.bar(df2, barmode = 'group',width=600,height=400)
fig.show()
# Default rate with vs without Aadhar on file.
print('Percentage of people who have given their Aadhar details and defaulted:',
np.round(df[(df['Aadhar_flag']==1) & (df['Loan_Default']==1)].shape[0]/(df[df['Aadhar_flag']==1].shape[0])*100,3))
print('Percentage of people who have not given their Aadhar details and defaulted:',
np.round(df[(df['Aadhar_flag']==0) & (df['Loan_Default']==1)].shape[0]/(df[df['Aadhar_flag']==0].shape[0])*100,3))
Percentage of people who have given their Aadhar details and defaulted: 21.041 Percentage of people who have not given their Aadhar details and defaulted: 26.535
#PAN
# Default counts by PAN-document availability.
pd.crosstab(df['PAN_flag'],df['Loan_Default'])
| Loan_Default | 0 | 1 |
|---|---|---|
| PAN_flag | ||
| 0 | 24598 | 6846 |
| 1 | 1888 | 570 |
df3 = pd.crosstab(df['PAN_flag'],df['Loan_Default'])
fig = px.bar(df3, barmode = 'group',width=600,height=400)
fig.show()
# Default rate with vs without PAN on file.
print('Percentage of people who have given their PAN details and defaulted:',
np.round(df[(df['PAN_flag']==1) & (df['Loan_Default']==1)].shape[0]/(df[df['PAN_flag']==1].shape[0])*100,3))
print('Percentage of people who have not given their PAN details and defaulted:',
np.round(df[(df['PAN_flag']==0) & (df['Loan_Default']==1)].shape[0]/(df[df['PAN_flag']==0].shape[0])*100,3))
Percentage of people who have given their PAN details and defaulted: 23.19 Percentage of people who have not given their PAN details and defaulted: 21.772
#Voter ID
# Default counts by voter-ID availability.
pd.crosstab(df['VoterID_flag'],df['Loan_Default'])
| Loan_Default | 0 | 1 |
|---|---|---|
| VoterID_flag | ||
| 0 | 23010 | 6200 |
| 1 | 3476 | 1216 |
df4= pd.crosstab(df['VoterID_flag'],df['Loan_Default'])
fig = px.bar(df4, barmode = 'group',width=600,height=400)
fig.show()
# Default rate with vs without a voter ID on file.
print('Percentage of people who have given their voter_id details and defaulted:',
np.round(df[(df['VoterID_flag']==1) & (df['Loan_Default']==1)].shape[0]/(df[df['VoterID_flag']==1].shape[0])*100,3))
print('Percentage of people who have not given their voter_id details and defaulted:',
np.round(df[(df['VoterID_flag']==0) & (df['Loan_Default']==1)].shape[0]/(df[df['VoterID_flag']==0].shape[0])*100,3))
Percentage of people who have given their voter_id details and defaulted: 25.916 Percentage of people who have not given their voter_id details and defaulted: 21.226
#Driving
# Default counts by driving-licence availability.
pd.crosstab(df['Driving_flag'],df['Loan_Default'])
| Loan_Default | 0 | 1 |
|---|---|---|
| Driving_flag | ||
| 0 | 25875 | 7233 |
| 1 | 611 | 183 |
df5= pd.crosstab(df['Driving_flag'],df['Loan_Default'])
fig = px.bar(df5, barmode = 'group',width=600,height=400)
fig.show()
# Default rate with vs without a driving licence on file.
print('Percentage of people who have given their DL details and defaulted:',
np.round(df[(df['Driving_flag']==1) & (df['Loan_Default']==1)].shape[0]/(df[df['Driving_flag']==1].shape[0])*100,3))
print('Percentage of people who have not given their DL details and defaulted:',
np.round(df[(df['Driving_flag']==0) & (df['Loan_Default']==1)].shape[0]/(df[df['Driving_flag']==0].shape[0])*100,3))
Percentage of people who have given their DL details and defaulted: 23.048 Percentage of people who have not given their DL details and defaulted: 21.847
#Passport
# Default counts by passport availability.
pd.crosstab(df['Passport_flag'],df['Loan_Default'])
| Loan_Default | 0 | 1 |
|---|---|---|
| Passport_flag | ||
| 0 | 26429 | 7404 |
| 1 | 57 | 12 |
df6= pd.crosstab(df['Passport_flag'],df['Loan_Default'])
fig = px.bar(df6, barmode = 'group',width=600,height=400)
fig.show()
# Default rate with vs without a passport on file (only 69 passport holders).
print('Percentage of people who have given their Passport details and defaulted:',
np.round(df[(df['Passport_flag']==1) & (df['Loan_Default']==1)].shape[0]/(df[df['Passport_flag']==1].shape[0])*100,3))
print('Percentage of people who have not given their Passport details and defaulted:',
np.round(df[(df['Passport_flag']==0) & (df['Loan_Default']==1)].shape[0]/(df[df['Passport_flag']==0].shape[0])*100,3))
Percentage of people who have given their Passport details and defaulted: 17.391 Percentage of people who have not given their Passport details and defaulted: 21.884
Main focus of the Hypothesis Testing is to draw relations and infer. insights between loan default and different features in this dataset (df)
Normality: the population data should follow a normal distribution (by default alpha = 5%)
H0: pop data = Normal
H1: pop data != normal
Note: if p value >=alpha --> pop data = Normal
# Distribution of the loan-to-value ratio.
# sns.distplot is deprecated (removed in seaborn 0.14); histplot with a KDE
# and density scaling is the modern equivalent of the old default.
sns.histplot(df.ltv, kde=True, stat='density')
<AxesSubplot:xlabel='ltv', ylabel='Density'>
# Shapiro–Wilk normality test on ltv (p < 0.05 ⇒ not normal).
# NOTE(review): SciPy documents the Shapiro p-value as unreliable for
# n > 5000; here n ≈ 34k, so treat the p-value qualitatively.
st.shapiro(df.ltv)
ShapiroResult(statistic=0.9293789863586426, pvalue=0.0)
Since the p value < alpha (0.05) --> reject H0.
Hence we can conclude that the population data of the LTV ratio does not follow a normal distribution.
So we cannot use a parametric test; proceed with a non-parametric test.
Test of Median
H0: pop LTV >=50
H1:pop LTV <50
pop_median=50   # hypothesized population median of ltv
alpha = 0.05
# Signed differences from the hypothesized median.
diff_median = df.ltv-pop_median
## Wilcoxon test
# One-sided Wilcoxon signed-rank test of H1: median(ltv) < 50.
st.wilcoxon(diff_median, alternative='less')
WilcoxonResult(statistic=570401111.5, pvalue=1.0)
WilcoxonResult(statistic=570401111.5, pvalue=1.0)
Since the p value (1.0) is greater than alpha (5%) --> fail to reject H0.
Inference: we cannot conclude that the population median of Loan to Value (LTV) is below 50; the data are consistent with a median >= 50.
Prob: Test whether the ltv value is same for Employment_Type?
# Default counts within each employment type.
sns.countplot(x="Employment_Type", hue="Loan_Default", data=df)
<AxesSubplot:xlabel='Employment_Type', ylabel='count'>
# Raw target counts: 26,486 non-default vs 7,416 default.
df.Loan_Default.value_counts()
0 26486 1 7416 Name: Loan_Default, dtype: int64
# Target count plot (positional data arg is deprecated in newer seaborn).
sns.countplot(df.Loan_Default)
<AxesSubplot:xlabel='Loan_Default', ylabel='count'>
H0: P=0.3 : The Default Loan propotion rate is 30%
H1: P!=0.3 : The Default Loan rate is not 30%
alpha=0.05
P=0.3            # hypothesized population default proportion
# NOTE(review): x (the number of defaults) is non-integer, and n=32761 matches
# neither df's 33902 rows nor the 7416 defaults counted above — these inputs
# look stale; verify them before trusting this test's conclusion.
x=2347.6
n=32761
p=x/n            # sample proportion
q=1-p
se= np.sqrt(p*q/n)              # standard error
z_st = (p-P)/se                 # z statistic
z_crit= st.norm.isf(alpha/2)    # two-sided critical value
print(z_st, z_crit)
-160.24197019005072 1.9599639845400545
# Two-sided p-value (cdf is appropriate here because z is negative).
print(" pvalue is", st.norm.cdf(z_st)*2)
pvalue is 0.0
from statsmodels.stats.proportion import proportions_ztest
# Cross-check with statsmodels (matches z = -160.24, p ≈ 0.0).
proportions_ztest(x,n,P)
(-160.24197019005072, 0.0)
Since p value < alpha(0.05)--> Reject H0. The loan default rate is not 30 %.
Test whether the loan proportion is different for different Employment type?
H0: P_Salaried = P_Self Employed; P_Salaried - P_Self Employed = 0 : There is no difference in loan default proportion
H1: P_Salaried != P_Self Employed; P_Salaried - P_Self Employed != 0 : There is a difference in loan default proportion
# Default counts by employment type (inputs for the two-proportion test).
pd.crosstab(df.Employment_Type,df.Loan_Default)
| Loan_Default | 0 | 1 |
|---|---|---|
| Employment_Type | ||
| Salaried | 11449 | 2936 |
| Self employed | 14167 | 4209 |
| Unemployed | 870 | 271 |
sns.countplot(x="Employment_Type", hue="Loan_Default", data=df)
<AxesSubplot:xlabel='Employment_Type', ylabel='count'>
# Category summary: 3 levels, 'Self employed' most frequent (18,376).
df["Employment_Type"].describe()
count 33902 unique 3 top Self employed freq 18376 Name: Employment_Type, dtype: object
alpha=0.05
P_diff=0        # H0: no difference between the two group proportions
# Two-proportion z-test for the default rate of two employment groups.
# NOTE(review): xm (11449) is the Salaried non-default count while nm (18376)
# is the Self-employed total (and similarly for xf/nf) — the count/total
# pairings look crossed relative to the crosstab above; verify the inputs
# before trusting the numeric conclusion.
xm= 11449
nm =18376
pm=xm/nm                            # sample proportion, group 1
xf=14167
nf=14385
pf=xf/nf                            # sample proportion, group 2
n=32761
p=(xm+xf)/(nm+nf)                   # pooled proportion
q=1-p
se= np.sqrt(p*q*((1/nm)+(1/nf)))    # pooled standard error
z_st = ((pm-pf)-(P_diff))/se        # z statistic
z_crit= st.norm.isf(alpha/2)        # two-sided critical value
print(z_st, z_crit)
# BUG FIX: the original computed st.norm.sf(z_st)*2, which is ~2.0 for a
# large negative z (sf of a very negative value is ~1) — an impossible
# p-value, as the "p value is 2.0" output showed. A two-sided p-value must
# use the magnitude of z.
print("p value is ", st.norm.sf(abs(z_st))*2)
p value is 2.0
# statsmodels cross-check of the two-proportion test (matches z = -78.70).
proportions_ztest([xm,xf],[nm,nf])
(-78.70024272701211, 0.0)
Since p value (0.0) < alpha(0.05) --> reject H0
We can conclude that there is difference in Loan Default proportion of Salaried and Self Employed
Is there any association b/w Loan Default and Region?
H0: There is no association (independence) b/w Loan Default and Region
H1: There is an association (dependence) b/w Loan Default and Region
# Contingency table of Loan_Default x Region for the chi-square test.
table= pd.crosstab(df.Loan_Default,df.Region)
table
| Region | East | North | South | West |
|---|---|---|---|---|
| Loan_Default | ||||
| 0 | 4931 | 2511 | 11971 | 7073 |
| 1 | 1437 | 676 | 3388 | 1915 |
sns.countplot(x="Region", hue="Loan_Default", data=df)
<AxesSubplot:xlabel='Region', ylabel='count'>
# Chi-square test of independence; returns (stat, p-value, dof, expected).
st.chi2_contingency(table)
(4.605765007227732,
0.20304808764047969,
3,
array([[ 4975.01173972, 2489.84962539, 11999.24706507, 7021.89156982],
[ 1392.98826028, 697.15037461, 3359.75293493, 1966.10843018]]))
# Chi-square verdict for the Loan_Default x Region table above.
# BUG FIX: the original plugged a stale statistic (3.6476) into the p-value
# and used dof=1 for the critical value; st.chi2_contingency above reported
# statistic 4.6058 with dof = (2-1)*(4-1) = 3 for the 2x4 table.
alpha = 0.05
chi2_stat = 4.605765007227732   # statistic from st.chi2_contingency(table)
dof = 3                         # (rows-1)*(cols-1)
## p value
p_value = st.chi2.sf(chi2_stat, dof)
print('p value:', p_value)
## Chi square critical
chi2_critical = st.chi2.isf(alpha, dof)
print('chi-square critical:', chi2_critical)
Chisquare Value Analysis:
chisquare_stat (4.61) < chi-square critical at dof=3 (7.81) ---> fail to reject H0
PVA:
p-value (0.203) > alpha (0.05) --> fail to reject H0
Hence we cannot conclude that there is an association b/w Loan Default and Region in this given dataset.
# Drop columns that are non-numeric (dates, free-text geography), redundant
# after the merges, or judged low-signal before modelling.
df.drop(['PAN_flag','Driving_flag','Passport_flag','Employment_Type','City_Code',
'Date_of_Birth', 'DisbursalDate', 'CREDIT.HISTORY.LENGTH', 'Region_ID', 'State_Code_y',
'City_Name_x', 'Postal_Code_y', 'Region', 'State Name'],axis=1,inplace=True)
df.columns
Index(['Loan_Id', 'Disbursed_Amount', 'Asset_Cost', 'ltv', 'Branch_Id',
'MobileNo_Avl_Flag', 'Aadhar_flag', 'VoterID_flag', 'PERFORM_CNS.SCORE',
'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'NO.OF_INQUIRIES',
'Loan_Default', 'Self employed'],
dtype='object')
# Confirm every remaining feature column is numeric.
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 33902 entries, 0 to 33901 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Loan_Id 33902 non-null int64 1 Disbursed_Amount 33902 non-null int64 2 Asset_Cost 33902 non-null int64 3 ltv 33902 non-null float64 4 Branch_Id 33902 non-null int64 5 MobileNo_Avl_Flag 33902 non-null int64 6 Aadhar_flag 33902 non-null int64 7 VoterID_flag 33902 non-null int64 8 PERFORM_CNS.SCORE 33902 non-null int64 9 DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS 33902 non-null int64 10 NO.OF_INQUIRIES 33902 non-null int64 11 Loan_Default 33902 non-null int64 12 Self employed 33902 non-null uint8 dtypes: float64(1), int64(11), uint8(1) memory usage: 4.4 MB
# Feature matrix / binary target for the first modelling pass.
x = df.drop('Loan_Default', axis=1)
y = df.Loan_Default
x = pd.get_dummies(x, drop_first=True)
from sklearn.preprocessing import StandardScaler
# Standardize all feature columns in a single pass instead of re-fitting a
# fresh scaler per column in a loop — column-wise z-scoring is identical, and
# the scaler no longer ends up holding only the last column's statistics.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split below, which leaks test-set statistics into training; fitting on
# x_train only would be cleaner.
sc = StandardScaler()
x = pd.DataFrame(sc.fit_transform(x), columns=x.columns, index=x.index)
from sklearn.model_selection import train_test_split
# Hold out 20% for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
for label, part in (('xtrain', x_train), ('ytrain', y_train),
                    ('xtest', x_test), ('ytest', y_test)):
    print(f'{label} shape:{part.shape}')
xtrain shape:(27121, 12) ytrain shape:(27121,) xtest shape:(6781, 12) ytest shape:(6781,)
# Baseline logistic-regression classifier with default hyper-parameters.
log_model= LogisticRegression()
log_model.fit(x_train,y_train)
LogisticRegression()
# Class-membership probabilities: column 0 = P(no default), column 1 = P(default).
prob_pred_y = log_model.predict_proba(x_test)
prob_pred_y
array([[0.88799453, 0.11200547],
[0.81002671, 0.18997329],
[0.78446481, 0.21553519],
...,
[0.84474411, 0.15525589],
[0.60501115, 0.39498885],
[0.80879084, 0.19120916]])
# Hard class labels at the default 0.5 probability threshold.
pred_y = log_model.predict(x_test)
pred_y
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
# Accuracy looks decent only because ~78% of test rows are class 0.
accuracy_score(y_test,pred_y)
0.7787936882465712
# Positive-class F1 / precision / recall expose the imbalance: the model
# almost never predicts a default (recall ~0.003).
f1_score(y_test,pred_y)
0.005305039787798409
precision_score(y_test,pred_y)
0.36363636363636365
recall_score(y_test,pred_y)
0.0026720106880427524
print(classification_report(y_test,pred_y))
precision recall f1-score support
0 0.78 1.00 0.88 5284
1 0.36 0.00 0.01 1497
accuracy 0.78 6781
macro avg 0.57 0.50 0.44 6781
weighted avg 0.69 0.78 0.68 6781
from sklearn.metrics import roc_auc_score
# NOTE(review): AUC on hard 0/1 predictions collapses to ~0.5; passing the
# probability scores (prob_pred_y[:, 1]) would give a meaningful ranking AUC.
roc_auc_score(y_test,pred_y)
0.5006736283568904
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,pred_y)
array([[5277, 7],
[1493, 4]], dtype=int64)
# Kappa near 0: agreement with the truth is barely above chance.
cohen_kappa_score(y_test, pred_y)
0.002091081030223818
from sklearn.metrics import plot_confusion_matrix
# NOTE(review): plot_confusion_matrix / plot_roc_curve were deprecated in
# scikit-learn 1.0 and removed in 1.2; ConfusionMatrixDisplay.from_estimator
# and RocCurveDisplay.from_estimator are the replacements.
plot_confusion_matrix(log_model, x_test, y_test, normalize='true')
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x18bd6bdcbb0>
from sklearn.metrics import plot_roc_curve
plot_roc_curve(log_model, x_test, y_test)
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x18bc1d8bac0>
# With standardized inputs these coefficients are roughly comparable as
# feature-effect magnitudes.
log_model.coef_
array([[-0.01260658, -0.14055189, 0.20372014, 0.39771263, 0.08238115,
0. , -0.13505059, -0.02421018, -0.19598684, 0.08100523,
0.10856729, 0.08812856]])
# Second modelling pass: rebuild target and features, standardize, and split
# 70/30 with a different fixed seed.
y = df['Loan_Default']
X = df.drop('Loan_Default',axis=1)
# Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=120)
from sklearn.linear_model import LogisticRegression
# liblinear solver; fixed random_state keeps the fit deterministic.
lr = LogisticRegression(random_state=42, solver='liblinear')
lr.fit(X_train, y_train)
LogisticRegression(random_state=42, solver='liblinear')
# Score the fitted model on both partitions: hard labels plus probabilities,
# then AUC from the positive-class probability column.
y_train_pred = lr.predict(X_train)
y_train_prob = lr.predict_proba(X_train)
y_test_pred = lr.predict(X_test)
y_test_prob = lr.predict_proba(X_test)
print(f'The train AUC score is: {roc_auc_score(y_train, y_train_prob[:, 1])}')
print(f'The test AUC score is: {roc_auc_score(y_test, y_test_prob[:, 1])}')
The train AUC score is: 0.60498187532925 The test AUC score is: 0.6074952418431273
import sklearn
from sklearn import metrics
# Test-set ROC curve; the fpr-vs-fpr trace draws the chance diagonal.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_prob[:, 1])
plt.plot(fpr, fpr)
plt.plot(fpr, tpr)
plt.grid()
plt.title('Test ROC curve')
plt.show()
# Only 26 of 10171 test rows are flagged as defaults — class imbalance
# dominates the plain logistic model.
confusion_matrix(y_test,y_test_pred)
array([[7937, 15],
[2208, 11]], dtype=int64)
sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True)
plt.show()
from sklearn.metrics import classification_report
# Per-class metrics: recall for class 1 is effectively zero.
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))
Test Classification Report
precision recall f1-score support
0 0.78 1.00 0.88 7952
1 0.42 0.00 0.01 2219
accuracy 0.78 10171
macro avg 0.60 0.50 0.44 10171
weighted avg 0.70 0.78 0.69 10171
from sklearn.calibration import CalibratedClassifierCV
# 3-fold isotonic calibration so predicted probabilities better match
# observed default frequencies.
model_isotonic = CalibratedClassifierCV(lr, cv=3, method='isotonic')
model_isotonic.fit(X_train,y_train)
CalibratedClassifierCV(base_estimator=LogisticRegression(random_state=42,
solver='liblinear'),
cv=3, method='isotonic')
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)
# Binary log loss of the calibrated probabilities (lower is better).
sklearn.metrics.log_loss(y_test,isotonic_predict_prob_test)
0.5190432301491877
from imblearn.over_sampling import SMOTE
# Oversample the minority (default) class in the training split only, so the
# classes are balanced; the test split stays untouched.
smote = SMOTE()
X_train_sm,y_train_sm = smote.fit_resample(X_train,y_train)
X_train_sm.shape, y_train_sm.shape
((37068, 12), (37068,))
# Hyper-parameters from an earlier search (per the variable name, presumably
# a RandomizedSearchCV run — TODO confirm); forest trained on the original,
# imbalanced training split.
rsearch1_best_params = {
    'n_estimators': 374,
    'max_depth': 13,
    'min_samples_split': 11,
    'min_samples_leaf': 10,
}
from sklearn.ensemble import RandomForestClassifier
rfc1 = RandomForestClassifier(random_state=300, **rsearch1_best_params)
rfc1.fit(X_train, y_train)
RandomForestClassifier(max_depth=13, min_samples_leaf=10, min_samples_split=11,
n_estimators=374, random_state=300)
# Evaluate the forest on both partitions; AUC from positive-class probabilities.
y_train_pred = rfc1.predict(X_train)
y_train_prob = rfc1.predict_proba(X_train)
y_test_pred = rfc1.predict(X_test)
y_test_prob = rfc1.predict_proba(X_test)
print(f'The train AUC score is: {roc_auc_score(y_train, y_train_prob[:, 1])}')
print(f'The test AUC score is: {roc_auc_score(y_test, y_test_prob[:, 1])}')
The train AUC score is: 0.8978686913757032 The test AUC score is: 0.749044061575401
# ROC curve for the tuned forest on the hold-out set (diagonal = chance).
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_prob[:, 1])
plt.plot(fpr, fpr)
plt.plot(fpr, tpr)
plt.grid()
plt.title('Test ROC curve')
plt.show()
# Even the tuned forest flags only 38 of 10171 test rows as defaults when
# trained on the imbalanced data.
confusion_matrix(y_test,y_test_pred)
array([[7946, 6],
[2187, 32]], dtype=int64)
sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True)
plt.show()
from sklearn.metrics import classification_report
# Class-1 recall is still ~0.01 despite higher precision.
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))
Test Classification Report
precision recall f1-score support
0 0.78 1.00 0.88 7952
1 0.84 0.01 0.03 2219
accuracy 0.78 10171
macro avg 0.81 0.51 0.45 10171
weighted avg 0.80 0.78 0.69 10171
from sklearn.calibration import CalibratedClassifierCV
# Calibrate the imbalanced-data forest's probabilities (3-fold isotonic).
model_isotonic = CalibratedClassifierCV(rfc1, cv=3, method='isotonic')
model_isotonic.fit(X_train,y_train)
CalibratedClassifierCV(base_estimator=RandomForestClassifier(max_depth=13,
min_samples_leaf=10,
min_samples_split=11,
n_estimators=374,
random_state=300),
cv=3, method='isotonic')
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)
# Log loss improves over the logistic baseline (0.519 -> 0.464).
sklearn.metrics.log_loss(y_test,isotonic_predict_prob_test)
0.4636111929532047
# Hyper-parameters for the SMOTE-balanced run (per the variable name, from an
# earlier search — TODO confirm); forest trained on the resampled data.
rsearch_best_params = {
    'n_estimators': 317,
    'max_depth': 17,
    'min_samples_split': 4,
    'min_samples_leaf': 2,
}
rfc = RandomForestClassifier(random_state=300, **rsearch_best_params)
rfc.fit(X_train_sm, y_train_sm)
RandomForestClassifier(max_depth=17, min_samples_leaf=2, min_samples_split=4,
n_estimators=317, random_state=300)
# Train metrics come from the resampled set; test metrics from the untouched
# hold-out split.
y_train_pred = rfc.predict(X_train_sm)
y_train_prob = rfc.predict_proba(X_train_sm)
y_test_pred = rfc.predict(X_test)
y_test_prob = rfc.predict_proba(X_test)
print(f'The train AUC score is: {roc_auc_score(y_train_sm, y_train_prob[:, 1])}')
print(f'The test AUC score is: {roc_auc_score(y_test, y_test_prob[:, 1])}')
The train AUC score is: 0.9849373389045851 The test AUC score is: 0.8159144139283652
# ROC curve for the SMOTE-trained forest on the hold-out set.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_prob[:, 1])
plt.plot(fpr, fpr)
plt.plot(fpr, tpr)
plt.grid()
plt.title('Test ROC curve')
plt.show()
# SMOTE training trades ~1572 false alarms for catching 1491 of 2219 defaults.
confusion_matrix(y_test,y_test_pred)
array([[6380, 1572],
[ 728, 1491]], dtype=int64)
sns.heatmap(confusion_matrix(y_test,y_test_pred),annot=True)
plt.show()
from sklearn.metrics import classification_report
# Class-1 recall jumps to 0.67 after SMOTE (precision drops to 0.49).
print('Test Classification Report\n')
print(classification_report(y_test,y_test_pred))
Test Classification Report
precision recall f1-score support
0 0.90 0.80 0.85 7952
1 0.49 0.67 0.56 2219
accuracy 0.77 10171
macro avg 0.69 0.74 0.71 10171
weighted avg 0.81 0.77 0.79 10171
from sklearn.calibration import CalibratedClassifierCV
# Calibrate the SMOTE-trained forest on the resampled training data.
model_isotonic = CalibratedClassifierCV(rfc, cv=3, method='isotonic')
model_isotonic.fit(X_train_sm,y_train_sm)
CalibratedClassifierCV(base_estimator=RandomForestClassifier(max_depth=17,
min_samples_leaf=2,
min_samples_split=4,
n_estimators=317,
random_state=300),
cv=3, method='isotonic')
isotonic_predict_prob_test = model_isotonic.predict_proba(X_test)
# Calibrated log loss evaluated on the real (imbalanced) test distribution.
sklearn.metrics.log_loss(y_test,isotonic_predict_prob_test)
0.4663005744023142
The problem statement asks us to estimate the likelihood of a borrower defaulting on a loan. So, in addition to predicting whether a person will default, we must also predict the probability that they will default.
Accordingly, we used the AUC score, the F1-score of the positive (default) class, and binary log loss as the performance metrics for assessing the models.
Without SMOTE, all models give significantly lower F1-scores for the default class. Applying SMOTE resolves this (the F1-score can also be tuned by selecting an appropriate threshold from the ROC curve).
Looking at the performance metrics of the models above, Logistic Regression with SMOTE performs very well compared to the other models: it produces good AUC scores (without overfitting) and the best F1-score for the default class, although its binary log loss is slightly higher than the others'.
Random Forest with SMOTE is the next best model. Its AUC scores indicate some overfitting compared to Logistic Regression, but it still achieves a good F1-score for the default class — slightly lower than Logistic Regression's — and a better (lower) binary log loss.